缺失数据
# Build a 3x3 frame of random values labelled a/c/e, then reindex it onto
# a..e so that the newly introduced rows 'b' and 'd' hold NaN.
df = pd.DataFrame(
    np.random.randn(3, 3),
    index=list('ace'),
    columns=['one', 'two', 'three'],
).reindex(list('abcde'))
print(df)
# Output (sample run; the values are random):
# one two three
# a -2.160823 1.205748 -0.903059
# b NaN NaN NaN
# c 0.982933 -1.107031 2.163404
# d NaN NaN NaN
# e -0.351059 0.493755 -0.096658
1> 检查缺失值
isnull() 和 notnull() 函数
# Rows 'b' and 'd' do not exist in the source frame, so reindexing fills
# them with NaN; they are the entries the checks below report as missing.
df = pd.DataFrame(
    np.random.randn(3, 3),
    index=['a', 'c', 'e'],
    columns=['one', 'two', 'three'],
).reindex(index=['a', 'b', 'c', 'd', 'e'])

# isnull() yields True where a value is missing.
print(df.loc[:, 'one'].isnull())
# Output:
# a False
# b True
# c False
# d True
# e False
# Name: one, dtype: bool

# notnull() yields True where a value is present.
print(df.loc[:, 'two'].notnull())
# Output:
# a True
# b False
# c True
# d False
# e True
# Name: two, dtype: bool
2> 缺失值的计算
在数据求和时,NaN 将被视为 0; 如果数据全部是 NaN,pandas 0.22 及之后的版本默认返回 0 (旧版本返回 NaN; 如需得到 NaN,可传入 min_count=1)
# Frame with NaN rows 'b' and 'd', used to show how sums treat missing data.
df = pd.DataFrame(np.random.randn(3, 3), index=['a', 'c', 'e'],
                  columns=['one', 'two', 'three'])
df = df.reindex(['a', 'b', 'c', 'd', 'e'])
# Start the frame on its own line; without the newline the column header is
# printed right after the label and no longer lines up with the rows.
print(f'原数组:\n{df}')
# Output (sample run; the values are random):
# 原数组:
# one two three
# a 0.024880 -0.615047 0.335393
# b NaN NaN NaN
# c 0.630599 -1.647667 -0.260803
# d NaN NaN NaN
# e 0.845244 0.391965 -2.285933

# Both spellings skip the NaN entries, so they print the same total.
print(f"数组求和1 : {np.sum(df['two'])}")
# Output:
# 数组求和1 : -1.870749246793861
print(f'数组求和2 : {df["two"].sum()}')
# Output:
# 数组求和2 : -1.870749246793861
3> 缺失数据填充
fillna()函数
# Frame whose rows 'b' and 'd' are entirely NaN after reindexing.
df = pd.DataFrame(
    np.random.randn(3, 3),
    index=['a', 'c', 'e'],
    columns=['one', 'two', 'three'],
)
df = df.reindex(index=['a', 'b', 'c', 'd', 'e'])
print(f"原数组:\n{df}")
# Output (sample run; the values are random):
# 原数组:
# one two three
# a 1.474877 0.180455 -0.104257
# b NaN NaN NaN
# c -0.724581 2.251094 -0.073617
# d NaN NaN NaN
# e 1.950274 0.381642 -0.320262

# fillna returns a new frame with every NaN replaced by the given scalar;
# df itself is not modified, so the second call starts from the same data.
print(f"用 0 填充 NaN 后的数组 :\n {df.fillna(value=0)}")
# Output:
# 用 0 填充 NaN 后的数组 :
# one two three
# a 1.474877 0.180455 -0.104257
# b 0.000000 0.000000 0.000000
# c -0.724581 2.251094 -0.073617
# d 0.000000 0.000000 0.000000
# e 1.950274 0.381642 -0.320262
print(f"用 3 填充 NaN 后的数组 :\n {df.fillna(value=3)}")
# Output:
# 用 3 填充 NaN 后的数组 :
# one two three
# a 1.474877 0.180455 -0.104257
# b 3.000000 3.000000 3.000000
# c -0.724581 2.251094 -0.073617
# d 3.000000 3.000000 3.000000
# e 1.950274 0.381642 -0.320262
4> 向前或向后填充
# Frame whose rows 'b' and 'd' are entirely NaN after reindexing.
df = pd.DataFrame(np.random.randn(3, 3), index=['a', 'c', 'e'],
                  columns=['one', 'two', 'three'])
df = df.reindex(['a', 'b', 'c', 'd', 'e'])
print(f'原数组:\n{df}')
# Output (sample run; the values are random):
# 原数组:
# one two three
# a -0.322989 1.459974 0.246409
# b NaN NaN NaN
# c 1.428074 0.103128 0.813241
# d NaN NaN NaN
# e 0.031593 1.016594 2.086443

# Forward fill: each NaN takes the last valid value above it.
# ffill() replaces fillna(method='pad'), whose `method` argument is
# deprecated in recent pandas releases.
print(f"向前填充 NaN 后的数组 :\n {df.ffill()}")
# Output:
# 向前填充 NaN 后的数组 :
# one two three
# a -0.322989 1.459974 0.246409
# b -0.322989 1.459974 0.246409
# c 1.428074 0.103128 0.813241
# d 1.428074 0.103128 0.813241
# e 0.031593 1.016594 2.086443

# Backward fill: each NaN takes the next valid value below it.
# bfill() replaces the deprecated fillna(method='backfill').
print(f'向后填充 NaN 后的数组 :\n {df.bfill()}')
# Output:
# 向后填充 NaN 后的数组 :
# one two three
# a -0.322989 1.459974 0.246409
# b 1.428074 0.103128 0.813241
# c 1.428074 0.103128 0.813241
# d 0.031593 1.016594 2.086443
# e 0.031593 1.016594 2.086443
5> 清除缺失值
在默认情况下,axis = 0,即在行上应用;
如果行内的任何值是 NaN,那么整行被排除
# Frame whose rows 'b' and 'd' are entirely NaN after reindexing.
df = pd.DataFrame(
    np.random.randn(3, 3),
    index=list('ace'),
    columns=['one', 'two', 'three'],
).reindex(list('abcde'))

# Default axis (rows): every row containing a NaN is dropped.
print(df.dropna())
# Output (sample run; the values are random):
# one two three
# a -2.129920 -1.001321 -1.522123
# c 0.182309 0.285505 1.195824
# e -1.163222 -0.938291 0.151461

# Along columns: every column contains a NaN here, so none survive.
print(df.dropna(axis='columns'))
# Output:
# Empty DataFrame
# Columns: []
# Index: [a, b, c, d, e]
6> 值替换
# Frame whose rows 'b' and 'd' are entirely NaN after reindexing.
df = pd.DataFrame(np.random.randn(3, 3), index=['a', 'c', 'e'],
                  columns=['one', 'two', 'three'])
df = df.reindex(['a', 'b', 'c', 'd', 'e'])
# replace() takes a {old: new} mapping; here every NaN becomes 0.14.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
print(df.replace({np.nan: 0.14}))
# Output (sample run; the values are random):
# one two three
# a 0.844849 0.308399 0.428219
# b 0.140000 0.140000 0.140000
# c -1.915478 0.713083 -0.365932
# d 0.140000 0.140000 0.140000
# e -0.291266 -1.076788 1.454755